library(conText)
library(tidytext)
library(dplyr)
library(readr)
source("utils.R")
set.seed(42L)

# Word embeddings read from disk (presumably a locally fitted GloVe model,
# judging by the file name -- TODO confirm).
pre_trained <- readRDS("data/wordembeddings/local_glove.rds")

# label_propagate() is not defined in this file (presumably it comes from
# utils.R). It is called with the single seed word "climate" and N = 100.
# NOTE(review): it appears to return the neighbours of the seed word that were
# kept as climate-related -- confirm against the helper's definition.
nns_selected <- label_propagate(
  seed_words = c("climate"),
  pre_trained = pre_trained,
  N = 100,
  graphics = FALSE
)

# Recorded from a previous run (presumably the indices of the neighbours kept):
# 1 8 9 11 14 15 17 18 19 57 68

# An empty selection would collapse to the pattern "", which grepl() matches
# against every string, so stop here rather than silently keeping all tweets.
stopifnot(length(nns_selected) > 0)

# One alternation pattern ("term1|term2|...") built from the selected terms.
climfilt <- paste0(nns_selected, collapse = "|")

# Selected terms recorded from a previous run:
# [1] "climate"           "#ClimateEmergency" "environment"       "planet"            "#climatechange"    "pollution"         "environmental"     "nature"            "#ClimateChange"
# [10] "carbon"            "emissions"

# Workflow:
#   1. use the embedding step to get the nearest related words;
#   2. seed these in the tweets, pulling out tweets with one of these terms;
#   3. get the most commonly appearing bigrams across these tweets -- these
#      will be our DV terms.
## filter by climate, environmental

# load() is expected to create `MPtweets` in the workspace; the code below
# reads its `tweet` column.
load("data/analysis/MPtweetsv2.Rdata")

# Keep tweets matching any of the seed-derived terms (case-insensitive).
climtwts <- MPtweets %>%
  filter(grepl(climfilt, tweet, ignore.case = TRUE))

# Tokenise the matching tweets into bigrams and count them, most frequent
# first.
clim_bigrams <- climtwts %>%
  unnest_tokens(bigram, tweet, token = "ngrams", n = 2) %>%
  count(bigram, sort = TRUE)

# head() returns at most 4000 rows. Indexing with [1:4000, ] reaches past the
# end of the table when fewer than 4000 distinct bigrams exist, which pads a
# data.frame with all-NA rows that would then be exported for coding.
bigrams_tocode <- head(clim_bigrams, 4000L)

write.csv(bigrams_tocode, "data/output/clim_bigrams.csv", row.names = FALSE)

# Inspected the top 4000 bigrams; the lowest frequency is 39 appearances.
# Bigrams were then coded manually; unigrams and trigrams are coded
# 1 = general, 2 = policy.

# Hand-coded term table (presumably derived from the exported
# clim_bigrams.csv -- TODO confirm). The code below reads its `bigram`, `n`,
# `general`, `policy`, `unigram` and `trigram` columns.
climgrams_coded <- read.csv("data/output/clim_bigrams_coded2.csv")

# General climate terms: bigrams flagged `general`, plus unigrams/trigrams
# coded 1. Rows where the condition evaluates to NA are dropped by filter().
# The term column is renamed to `climgram`, and only the first row per term is
# kept.
clim_general <- climgrams_coded %>%
  filter(general == 1 | unigram == 1 | trigram == 1) %>%
  select(climgram = bigram, n) %>%
  distinct(climgram, .keep_all = TRUE)

write.csv(clim_general, "data/output/climgenterms.csv", row.names = FALSE)

# Climate policy terms: bigrams flagged `policy`, plus unigrams/trigrams
# coded 2. Same shaping as the general terms.
clim_policy <- climgrams_coded %>%
  filter(policy == 1 | unigram == 2 | trigram == 2) %>%
  select(climgram = bigram, n) %>%
  distinct(climgram, .keep_all = TRUE)

write.csv(clim_policy, "data/output/climpolicyterms.csv", row.names = FALSE)

# Collapse a vector of coded terms into one alternation pattern
# ("term1|term2|...") for use with grepl(). Terms are used as regular
# expressions as-is (they are not escaped).
#
# Missing, empty and whitespace-only terms are dropped first: an NA would be
# pasted in as the literal text "NA", and an empty alternative matches every
# string. Stops if no usable term remains.
build_term_pattern <- function(terms) {
  terms <- as.character(terms)
  usable <- !is.na(terms) & nzchar(trimws(terms))
  terms <- terms[usable]
  stopifnot(length(terms) > 0)
  paste0(terms, collapse = "|")
}

# Pattern matching any general climate term.
climgen <- build_term_pattern(clim_general$climgram)
saveRDS(climgen, "data/output/climgenterms.rds")

# Pattern matching any climate policy term.
climpol <- build_term_pattern(clim_policy$climgram)
saveRDS(climpol, "data/output/climpolicyterms.rds")

# Count general climate tweets: tweets matching any general term
# (case-insensitive).
climgentwts <- MPtweets %>%
  filter(grepl(climgen, tweet, ignore.case = TRUE))
# 15186 tweets

# Get a sample to check for climate skeptic positions. The sample size is
# capped at the number of matching tweets because sample_n() errors when asked
# for more rows than exist (sampling is without replacement).
climgentwts_tocode <- climgentwts %>%
  sample_n(min(500L, nrow(climgentwts))) %>%
  select(date, username, tweet)

write.csv(climgentwts_tocode, "data/output/climtwts_tocode.csv",
          row.names = FALSE)
# 0 tweets expressing any sort of climate skeptic position

# Tweets matching any climate policy term (case-insensitive).
climpoltwts <- MPtweets %>%
  filter(grepl(climpol, tweet, ignore.case = TRUE))
# 17820 tweets

# Speeches table; the code below reads its `speech_text`, `speaker_name`,
# `speech_date` and `speech_class` columns.
speeches <- read_csv("data/output/speeches.csv")

# Speeches matching any general climate term (case-insensitive).
climgenspchs <- speeches %>%
  filter(grepl(climgen, speech_text, ignore.case = TRUE))
# 1774 speeches

# Get a sample to check for climate skeptic positions. The sample size is
# capped at the number of matching speeches because sample_n() errors when
# asked for more rows than exist (sampling is without replacement).
climgenspchs_tocode <- climgenspchs %>%
  sample_n(min(500L, nrow(climgenspchs))) %>%
  select(speaker_name, speech_date, speech_class, speech_text)

write.csv(climgenspchs_tocode, "data/output/climspchs_tocode.csv",
          row.names = FALSE)
# 0 expressing any sort of climate skeptic position
